library(tidyverse)
library(readr)
library(fuzzyjoin)
library(car)
library(stringr)
library(maps)
setwd(rprojroot::find_root(rprojroot::is_rstudio_project))
#Load data
cces <- read.csv("Intermediate Data/CCES_merged_policy_responsiveness_missing_roll_call_relig.csv")
cong <- read.csv("Intermediate Data/HSall_members.csv")
rcv <- read.csv("Intermediate Data/roll_call_votes_for_merge.csv") #Used to be final_roll_call_votes.csv
#rcv <- read.csv("Backups/final_roll_call_votes.csv")

#editting state identifiers
#Makes state identifiers consistent across data sets
state.fips$fips <- as.character(state.fips$fips)
state.fips2 <- state.fips[,c(1,5)]
 state.fips2 <- unique(state.fips2)

non_cont <- data.frame(c("AL", "HI"), c(2,15))
colnames(non_cont) <- c("abb","fips")

state.fips2$fips <- as.numeric(state.fips2$fips)
state.fips2 <- bind_rows(state.fips2,non_cont)

cces$state <- as.character(cces$state)
cces$state_fips <- gsub("([0-9]+).*$", "\\1", cces$state)
cces$state_fips <- as.numeric(cces$state_fips)
#introduces 46,000 NAs

cces <- left_join(cces, state.fips2, by = c("state_fips"="fips"))

table(cces$abb)
sum(is.na(cces$abb))
cces$abb[is.na(cces$abb)] <- cces$state[is.na(cces$abb)]
sum(is.na(cces$abb))
#replaces NAs with state

table(cces$abb)

#subsetting congress data
cong <- cong[,c(1:7,10)]
cong <- subset(cong, congress >= 108)

#Joining rol call vote data with descriptive data. The unit of analysis / row divisions is each congressperson during each congress.
#This is done to track nominate scores in case they are potentially needed later on. But it might make sense to aggregate. Not sure.
merged <- right_join(rcv, cong, by = c( "name"="bioname", "state_abbrev", "icpsr"))

#Code congress
cces$congress[cces$year %in% c(2005,2006)] <- 109
cces$congress[cces$year %in% c(2007,2008)] <- 110
cces$congress[cces$year %in% c(2009,2010)] <- 111
cces$congress[cces$year %in% c(2011,2012)] <- 112
cces$congress[cces$year %in% c(2013,2014)] <- 113
cces$congress[cces$year %in% c(2015,2016)] <- 114
cces$congress[cces$year %in% c(2017,2018)] <- 115
cces$congress[cces$year %in% c(2019,2020)] <- 116
cces$congress[cces$year %in% c(2021,2022)] <- 117

#Editting names
# merged$last_name <- tolower(gsub("^(.*?),.*", "\\1", merged$name))
# merged$first_name <- tolower(sub("^.*?,", "", merged$name))
# merged$first_name <- gsub("\\.","",  merged$first_name)
# merged$first_name <- gsub('(?<=\\s)(\\w{1,2}\\s)','', merged$first_name, perl=T)
# 
# merged$first_name <- trimws(merged$first_name)
# merged$first_name_init <- substr(merged$first_name,1,1)
# 
# cces$house_name <- tolower(cces$house_name)
# cces$sen1_name <- tolower(cces$sen1_name)
# cces$sen2_name <- tolower(cces$sen2_name)
# 
# cces$house_name <- gsub("(.*),.*", "\\1", cces$house_name)
# cces$sen1_name <- gsub("(.*),.*", "\\1", cces$sen1_name)
# cces$sen2_name <- gsub("(.*),.*", "\\1", cces$sen2_name)
# 
# cces$house_last_name <- word(cces$house_name, -1)
# cces$sen1_last_name <- word(cces$sen1_name, -1)
# cces$sen2_last_name <- word(cces$sen2_name, -1)
# 
# cces$house_first_name_init <- substring(cces$house_name,1,1)
# cces$sen1_first_name_init <- substring(cces$sen1_name,1,1)
# cces$sen2_first_name_init <- substring(cces$sen2_name,1,1)
# 
# table(cces$sen1_name)

#replacing 0's and 9's with NA and 6's with 0'
indices = which(merged == 9, arr.ind = TRUE)
merged<-replace(merged, indices[names(merged)[indices[,2]]!= "district_code",], NA)
indices = which(merged == 0, arr.ind = TRUE)
merged<-replace(merged, indices[names(merged)[indices[,2]]!= "district_code",], NA)
indices = which(merged == 6, arr.ind = TRUE)
merged<-replace(merged, indices[names(merged)[indices[,2]]!= "district_code",], 0)
rm(indices)

#merge based on district code; not ideal, presumably not merging at large districts and maybe something else
#also duplicates if one district is represented by two MCs in a term
  #mark these items and ignore them for now

merged.house <- subset(merged, chamber=="House")
x<-merged.house %>% group_by(state_abbrev, congress, district_code) %>% mutate(rep = 1:n())
x<-x %>% group_by(state_abbrev, congress, district_code) %>% mutate(Rep1 = icpsr[rep == 1], 
                                                     Rep2 = ifelse(max(rep>1),icpsr[rep == 2], NA), 
                                                     Rep3 = ifelse(max(rep>2),icpsr[rep == 3], NA), 
                                                     Rep4 = ifelse(max(rep>3),icpsr[rep == 4], NA), 
                                                     Rep5 = ifelse(max(rep>4),icpsr[rep == 5], NA), 
                                                     Rep6 = ifelse(max(rep>5),icpsr[rep == 6], NA)) #NOT WORKING

df<-x %>%
  pivot_longer(cols = starts_with('V'), names_to="vote_name", values_to="vote") %>%
  mutate(rep.vote = paste0(rep, "_", vote_name)) %>% select(-c(1:4, 7:8, 10:11,18)) %>%
  pivot_wider(names_from = "rep.vote", values_from = "vote")

#house <- left_join(cces, subset(merged, chamber=="House"), by = c("cd"="district_code",  "abb"="state_abbrev", "congress"))
#table(!is.na(house$icpsr), house$congress)
#df <- select(df, -c(name, state_icpsr, vote_name))
house_cces <- left_join(cces, df, by = c("abb"="state_abbrev", "congress", "cd" = "district_code"))



#Senate merge
merged.sen <- subset(merged, chamber=="Senate")
x<-merged.sen %>% group_by(state_abbrev, congress) %>% mutate(senator = 1:n()) #create relative senator number

x<-x %>% group_by(state_abbrev, congress) %>% mutate(Sen1 = icpsr[senator == 1],
                                                       Sen2 = icpsr[senator == 2],
                                                       Sen3 = ifelse(max(senator>2),icpsr[senator == 3], NA),
                                                       Sen4 = ifelse(max(senator>3),icpsr[senator == 4], NA),
                                                     Sen5 = ifelse(max(senator>4),icpsr[senator == 5], NA),
                                                     Sen6 = ifelse(max(senator>5),icpsr[senator == 6], NA),
                                                     Sen7 = ifelse(max(senator>6),icpsr[senator == 7], NA),
                                                     Sen8 = ifelse(max(senator>7),icpsr[senator == 8], NA),
                                                     Sen9 = ifelse(max(senator>8),icpsr[senator == 9], NA))

df<-x %>%
  pivot_longer(cols = starts_with('V'), names_to="vote_name", values_to="vote") %>%
  mutate(sen.vote = paste0(senator, "_", vote_name)) %>% select(-c(1:4, 7:11,21)) %>%
  pivot_wider(names_from = "sen.vote", values_from = "vote")

#SAMPLE CODE

#assign one senator sen.1, one senator sen.2 arbitrarily, if three then last one gets sen 3
final_cces <- left_join(house_cces, df, by = c("abb"="state_abbrev", "congress"),
                        suffix=c(".hr", ".s"))

colnames(final_cces)

write_csv(final_cces, "Intermediate Data/merged_cces_rollcall.csv")
